---
title: 'Replication for "Thin-skinned Leaders: Regime Legitimation, Protest Issues
  and Repression in Autocracies"'
subtitle: 'Data preparation script'  
author: "Eda Keremoglu, Sebastian Hellmeier, Nils B. Weidmann"
date: "2020"
output: html_document
---

```{r setup, include=FALSE, warning=FALSE}
knitr::opts_chunk$set(echo = T, include = T, warning = F)
```

load packages
```{r}
library(tidyverse)
library(stringi)
library(quanteda)
library(BTM)
library(textmineR)
library(boRingTrees)
library(data.table)
library(haven)
```

load MMAD data (contains only anti-governments event)
```{r}
load("mmad_events.Rda")
```

prepare MMAD text data on protest issues for topic model
```{r}
# delete "for" and "against" indicators as well as brackets
to_delete <- c('-|for:|against:|;|:|\\{|\\}|"|\\,|\'')
mmad_events$all_issues <-  str_replace_all(mmad_events$all_issues, pattern = to_delete, replacement = " ")

# trim white spaces
mmad_events$all_issues <- stri_trim(mmad_events$all_issues)

# convert to lowercase if word is president
mmad_events$all_issues <- gsub("President", "president", mmad_events$all_issues)

# remove the word year
mmad_events$all_issues <- gsub("year", "", mmad_events$all_issues)

# remove names of countries and presidents and UN etc.
mmad_events$all_issues <- gsub("[A-Z][a-z]*", "", mmad_events$all_issues)

# convert to lowercase
mmad_events$all_issues <- stri_trans_tolower(mmad_events$all_issues)

# remove all numbers
mmad_events$all_issues <-gsub("\\d", "", mmad_events$all_issues)

# remove all very short words
mmad_events$all_issues <-gsub('\\b\\w{1,2}\\s', "", mmad_events$all_issues)

# create a tokens object for each event
tokens <- tokens(mmad_events$all_issues)

# remove english stopwords
sw <- stopwords(language = "en")
tokens <- tokens_remove(tokens, sw)

# stem words
tokens <- tokens_wordstem(tokens)

# convert to matrix
tokens_matrix <- dfm(tokens)

# convert to data frame
tokens_data <- convert(tokens_matrix, to = "data.frame")

# from wide to long
tokens_data_clean <- set_tidy_names(tokens_data)

tokens_data_clean <- tokens_data_clean %>%
 mutate_if(is.numeric, list(~ na_if(., 0)))

# transform from wide to long
tokens_long <-  gather(tokens_data_clean, key = "token", value = "ocurrence", na.rm = T, extrem:copyright)

# extract text id and sort
tokens_long$doc_id <- as.numeric(gsub("text", "", tokens_long[,1]))
tokens_long <-  tokens_long[order(tokens_long$doc_id),] 

# change column name
colnames(tokens_long)[colnames(tokens_long)=="doc_id"] <- "event_id"

# subset for topic modelling
tokens_final <- tokens_long[, c("event_id", "token")]
```

run and inspect model
```{r}
set.seed(1235)

# run model
model_BTM <- BTM(tokens_final, k = 8, beta = 0.01, iter = 1000, trace = 100, background = T)

# save model 
save(model_BTM, file = "model.Rds")

# Basis for Table 1
# inspect model
model_BTM
topicterms <- terms(model_BTM, top_n = 10)
topicterms

# Extract topicterms to dataframe
repression <- topicterms[[2]] %>% 
  mutate(topic = "repression")
governance <- topicterms[[3]] %>% 
  mutate(topic = "governance")
incumbents <- topicterms[[4]] %>% 
  mutate(topic = "incumbents")
elections <- topicterms[[5]] %>% 
  mutate(topic = "elections")
opposition <- topicterms[[6]] %>% 
  mutate(topic = "opposition")
regime <- topicterms[[7]] %>% 
  mutate(topic = "regime")
economy <- topicterms[[8]] %>% 
  mutate(topic = "economy")

# combine all to one data frame
topics_summary_BTM <- rbind(repression, governance, incumbents, elections, opposition, regime, economy) %>% 
  group_by(topic) %>% 
  summarise(terms = paste0(token, collapse = ", "))

# save summary as csv file
write_csv(topics_summary_BTM, "summary_BTM.csv")

# predict scores
scores <- predict(model_BTM, newdata = tokens_final, type = "sum_b")
scores <- as.data.frame(scores)

# add row names as variable
scores <- data.frame(event_id = as.numeric(row.names(scores)), scores)

# change column names
names(scores) <- gsub(x = names(scores), pattern = "V", replacement = "topic")

# merge mmad events and topics
mmad_events_btm <- merge(mmad_events, scores, by=c("event_id"), all.x = T)
```


Alternative topic model
```{r, echo=FALSE, include=T, eval=T}
# load mmad_events
load("mmad_events.Rda")

# Code inspired by vignette from Jones (https://cran.r-project.org/web/packages/textmineR/vignettes/c_topic_modeling.html)

# delete "for" and "against" indicators as well as brackets
to_delete <- c('-|for:|against:|;|:|\\{|\\}|"|\\,|\'')
mmad_events$all_issues <-  str_replace_all(mmad_events$all_issues, pattern = to_delete, replacement = " ")

# trim white spaces
mmad_events$all_issues <- stri_trim(mmad_events$all_issues)

# convert to lowercase if word is president
mmad_events$all_issues <- gsub("President", "president", mmad_events$all_issues)

# remove the word year
mmad_events$all_issues <- gsub("year", "", mmad_events$all_issues)

# remove names of countries and presidents and UN etc.
mmad_events$all_issues <- gsub("[A-Z][a-z]*", "", mmad_events$all_issues)

# convert to lowercase
mmad_events$all_issues <- stri_trans_tolower(mmad_events$all_issues)

# remove all numbers
mmad_events$all_issues <-gsub("\\d", "", mmad_events$all_issues)

#remove all very short words
mmad_events$all_issues <-gsub('\\b\\w{1,2}\\s', "", mmad_events$all_issues)

# create a document term matrix 
dtm <- CreateDtm(doc_vec = mmad_events$all_issues, 
                 doc_names = mmad_events$event_id, 
                 ngram_window = c(1, 2), 
                 stopword_vec = c(stopwords::stopwords("en"), 
                                  stopwords::stopwords(source = "smart")), 
                 lower = TRUE, 
                 remove_punctuation = TRUE, 
                 remove_numbers = TRUE, 
                 verbose = T, 
                 cpus = 2,  
                 stem_lemma_function = function(x) SnowballC::wordStem(x, "porter"))

dtm <- dtm[,colSums(dtm) > 2]

set.seed(12082020)

# Fit LDA model
model_LDA <- FitLdaModel(dtm = dtm, 
                     k = 7,
                     iterations = 750, 
                     burnin = 180,
                     alpha = 0.1,
                     beta = 0.05,
                     optimize_alpha = TRUE,
                     calc_likelihood = TRUE,
                     calc_coherence = TRUE,
                     calc_r2 = TRUE,
                     cpus = 2) 

str(model_LDA)

# R-squared 
model_LDA$r2

summary(model_LDA$coherence)

# get the top terms of each topic
model_LDA$top_terms <- GetTopTerms(phi = model_LDA$phi, M = 10)

head(model_LDA$top_terms)

# get the prevalence of each topic
model_LDA$prevalence <- colSums(model_LDA$theta) / sum(model_LDA$theta) * 100

# naive topic labeling based on probable bigrams
model_LDA$labels <- LabelTopics(assignments = model_LDA$theta > 0.05, 
                            dtm = dtm,
                            M = 1)

head(model_LDA$labels)

# summary table: Table A.1 in the paper
summary_LDA <- data.frame(topic = rownames(model_LDA$phi),
                            label = model_LDA$labels,
                            coherence = round(model_LDA$coherence, 3),
                            prevalence = round(model_LDA$prevalence,3),
                            top_terms = apply(model_LDA$top_terms, 2, function(x){
                              paste(x, collapse = ", ")
                            }),
                            stringsAsFactors = FALSE)

write_csv(summary_LDA, "summary_LDA.csv")

#summary[order(summary_LDA$prevalence, decreasing = TRUE) , ][ 1:10 , ]

# predictions with dot
assignments_dot <- predict(model_LDA, dtm,
                           method = "dot")

mmad_events_data <- cbind(mmad_events_btm, assignments_dot)
```

Recode repression to binary indicator and extract year var; additional cleanup
```{r}
# recode repression to binary indicator
mmad_events_data$repression_binary <- ifelse(mmad_events_data$max_repression < 2, 0, 1)

# create variable that identifies events without repression
mmad_events_data$no_repression <- ifelse(mmad_events_data$max_repression > 1, 0, 1)

mmad_events_data$topic_NA <- ifelse(is.na(mmad_events_data$topic1), 1, 0)

# extract year
mmad_events_data$year <- as.numeric(substring(mmad_events_data$event_date, 1, 4))

# delete events with part number <25
mmad_events_data <- mmad_events_data %>% filter(avg_participants > 24)
```

Create issue indices by taking max of either topic that relates to broader issue
```{r}
mmad_events_data <- mmad_events_data %>% 
  mutate(issue_performance = pmax(topic3, topic8), 
         issue_rat_legal = pmax(topic5, topic6),
         issue_leader = topic4,
         issue_regime = topic7,
         issue_repression = topic2)
```

Add event history variables
````{r}
mmad_events_data <- data.table::as.data.table(mmad_events_data)
## count the number of events in last 21 days, not including "today"
# in country
mmad_events_data[, event_history_country := rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", 
                                     lower = 0, upper = 22, incbounds = F, stat = sum)]
# in location
mmad_events_data[, event_history_location := rollingByCalcs(mmad_events_data, dates = "event_date", by = "location",
                                     lower = 0, upper = 22, incbounds = F, stat = sum)]

## count the number of repressed protest events in the preceding 7, 14 and 21 days
# in country
mmad_events_data[, repr_hist_country_7 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", target = "repression_binary",
                                     lower = 0, upper = 8, incbounds = F, stat = sum)]

mmad_events_data[, repr_hist_country_21 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", target = "repression_binary",
                                     lower = 0, upper = 22, incbounds = F, stat = sum)]

# in location
mmad_events_data[, repr_hist_loc_7 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "location", target = "repression_binary",
                                     lower = 0, upper = 8, incbounds = F, stat = sum)]

mmad_events_data[, repr_hist_loc_21 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "location", target = "repression_binary",
                                     lower = 0, upper = 22, incbounds = F, stat = sum)]

## calculate the number of unrepressed protest events in the preceding 7, 14 and 21 days
# in country
mmad_events_data[, unrepr_hist_country_7 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", target = "no_repression",
                                     lower = 0, upper = 8, incbounds = F, stat = sum)]

mmad_events_data[, unrepr_hist_country_21 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", target = "no_repression",
                                     lower = 0, upper = 22, incbounds = F, stat = sum)]

# in location
mmad_events_data[, unrepr_hist_loc_7 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "location", target = "no_repression",
                                     lower = 0, upper = 8, incbounds = F, stat = sum)]

mmad_events_data[, unrepr_hist_loc_21 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "location", target = "no_repression",
                                     lower = 0, upper = 22, incbounds = F, stat = sum)]


## calculate average strength of leader issue in preceding 7, 14, and 21 days
# in country
mmad_events_data[, lead_hist_country_7 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", target = "issue_leader",
                                     lower = 0, upper = 8, incbounds = F, stat = mean, na.rm = T)]

mmad_events_data[, lead_hist_country_21 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", target = "issue_leader",
                                     lower = 0, upper = 22, incbounds = F, stat = mean, na.rm = T)]

# in location
mmad_events_data[, lead_hist_loc_7 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "location", target = "issue_leader",
                                     lower = 0, upper = 8, incbounds = F, stat = mean, na.rm = T)]

mmad_events_data[, lead_hist_loc_21 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "location", target = "issue_leader",
                                     lower = 0, upper = 22, incbounds = F, stat = mean, na.rm = T)]

## calculate average strength of repression topic in preceding 7, 14, and 21 days
# in country
mmad_events_data[, reptopic_hist_country_7 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", target = "topic2",
                                     lower = 0, upper = 8, incbounds = F, stat = mean, na.rm = T)]

mmad_events_data[, reptopic_hist_country_21 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "cowcode", target = "topic2",
                                     lower = 0, upper = 22, incbounds = F, stat = mean, na.rm = T)]

# in location
mmad_events_data[, reptopic_hist_loc_7 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "location", target = "topic2",
                                     lower = 0, upper = 8, incbounds = F, stat = mean, na.rm = T)]

mmad_events_data[, reptopic_hist_loc_21 := boRingTrees::rollingByCalcs(mmad_events_data, dates = "event_date", by = "location", target = "topic2",
                                     lower = 0, upper = 22, incbounds = F, stat = mean, na.rm = T)]

mmad_events_data <- as.data.frame(mmad_events_data)

# define missing values
mmad_events_data <- mmad_events_data %>% 
  mutate(lead_hist_country_7 = na_if(lead_hist_country_7, "NaN"),
         lead_hist_country_21 = na_if(lead_hist_country_21, "NaN"),
         lead_hist_loc_7 = na_if(lead_hist_loc_7, "NaN"),
         lead_hist_loc_21 = na_if(lead_hist_loc_21, "NaN"),
         reptopic_hist_country_7 = na_if(reptopic_hist_country_7, "NaN"),
         reptopic_hist_country_21 = na_if(reptopic_hist_country_21, "NaN"),
         reptopic_hist_loc_7 = na_if(reptopic_hist_loc_7,"NaN"),
         reptopic_hist_loc_21 = na_if(reptopic_hist_loc_21,"NaN"))
```


Join mmad events data with vdem data on legitimacy claims and country-level covariates (dataset: leg_cov_data)
```{r}
# change class of cowcode and year vars for join
mmad_events_data <- mmad_events_data %>% 
  mutate(cowcode = as.factor(cowcode),
         year = as.factor(year))

# load covariate data
load("leg_cov_data.Rda")

data <- mmad_events_data %>% 
  left_join(leg_cov_data, by = c("cowcode", "year"))
```


Calculate issue and legitimacy claim strengths relative to others
```{r}
data <- data %>% 
  rowwise() %>% 
  mutate(issue_performance_rel = issue_performance - (sort(c(issue_rat_legal, issue_leader),T)[1]),
         issue_rat_legal_rel = issue_rat_legal - (sort(c(issue_performance, issue_leader),T)[1]),
         issue_leader_rel = issue_leader - (sort(c(issue_performance, issue_rat_legal),T)[1])) %>% 
  mutate(v2exl_legitperf_rel = v2exl_legitperf - (sort(c(v2exl_legitratio, v2exl_legitlead),T)[1]),
         v2exl_legitratio_rel = v2exl_legitratio - (sort(c(v2exl_legitperf, v2exl_legitlead),T)[1]),
         v2exl_legitlead_rel = v2exl_legitlead - (sort(c(v2exl_legitperf, v2exl_legitratio),T)[1]))
```

Strongest issues and strongest claims as dummies
```{r}
# reorder vars for col selection
data <- data %>% select(event_id, event_date, location, cowcode, year, all_issues, max_repression, scope, part_violence, avg_participants, actors, repression_binary, issue_performance, issue_rat_legal, issue_leader, issue_regime, issue_repression, v2exl_legitlead, v2exl_legitperf, v2exl_legitratio, v2exl_legitideol, everything())
                        
data$max_claim <- colnames(data[18:21])[max.col(data[18:21],ties.method="random")]
data$max_issue <- colnames(data[13:17])[max.col(data[13:17],ties.method="random")]

data <- data %>% 
  mutate(max_claim_perf = if_else(max_claim == "v2exl_legitperf", 1, 0),
         max_claim_lead = if_else(max_claim == "v2exl_legitlead", 1, 0),
         max_claim_ratio = if_else(max_claim == "v2exl_legitratio", 1, 0),
         max_claim_ideol = if_else(max_claim == "v2exl_legitideol", 1, 0),
         max_issue_perf = if_else(max_issue == "issue_performance", 1, 0),
         max_issue_lead = if_else(max_issue == "issue_leader", 1, 0),
         max_issue_ratio = if_else(max_issue == "issue_rat_legal", 1, 0),
         max_issue_regime = if_else(max_issue == "issue_regime", 1, 0),
         max_issue_repression = if_else(max_issue == "issue_repression", 1, 0))
```

Save final dataset
```{r}
save(data, file = "data.Rda")
```

Generate subsets for analysis in Stata
```{r}
stata_leader_ord <- data %>% 
  mutate(legitlead_ord_high = if_else(v2exl_legitlead_ord > 2, 1, 0)) %>% 
  select(event_id, event_date, location, cowcode, year, repression_binary, issue_performance, issue_rat_legal,
         issue_leader,issue_regime, v2exl_legitperf, v2exl_legitratio, v2exl_legitlead, part_violence,
         avg_participants, scope, event_history_location, repr_hist_country_21, unrepr_hist_country_21,
         repr_hist_loc_21, unrepr_hist_loc_21, topic2,
         issue_repression, gdppc, pop, v2x_polyarchy, v2x_polyarchy,
         theta_mean, v2x_ex_military, v2x_ex_party, armed_confl, legitlead_ord_high, v2exl_legitlead_ord)  %>% 
  filter(legitlead_ord_high == 1)

stata_noleader_ord <- data %>% 
  mutate(legitlead_ord_high = if_else(v2exl_legitlead_ord > 2, 1, 0)) %>% 
  select(event_id, event_date, location, cowcode, year, repression_binary, issue_performance, issue_rat_legal,
         issue_leader, issue_regime, v2exl_legitperf, v2exl_legitratio, v2exl_legitlead, part_violence,
         avg_participants, scope,  event_history_location, repr_hist_country_21, unrepr_hist_country_21,
         repr_hist_loc_21, unrepr_hist_loc_21, topic2, issue_repression, gdppc, pop,v2x_polyarchy,
         v2x_polyarchy, theta_mean, v2x_ex_military, v2x_ex_party, armed_confl,
         legitlead_ord_high, v2exl_legitlead_ord)  %>% 
  filter(legitlead_ord_high == 0)

# write output as Stata files
haven::write_dta(stata_leader_ord, "data_subset_leader_ord.dta")
haven::write_dta(stata_noleader_ord, "data_subset_noleader_ord.dta")
```


